In [1]:
import glob
import os
import random
import subprocess
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybedtools as pbt
import seaborn as sns
import cardipspy as cpy
import ciepy
%matplotlib inline
dy_name = 'expression_outliers'
import socket
if socket.gethostname() == 'fl-hn1' or socket.gethostname() == 'fl-hn2':
dy = os.path.join(ciepy.root, 'sandbox', dy_name)
cpy.makedir(dy)
pbt.set_tempdir(dy)
outdir = os.path.join(ciepy.root, 'output', dy_name)
cpy.makedir(outdir)
private_outdir = os.path.join(ciepy.root, 'private_output', dy_name)
cpy.makedir(private_outdir)
In [4]:
fn = os.path.join(ciepy.root, 'output', 'input_data', 'rsem_tpm.tsv')
tpm = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'rnaseq_metadata.tsv')
rna_meta = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'subject_metadata.tsv')
subject_meta = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'wgs_metadata.tsv')
wgs_meta = pd.read_table(fn, index_col=0)
gene_info = pd.read_table(cpy.gencode_gene_info, index_col=0)
In [5]:
tdf = subject_meta[subject_meta.estimated_ethnicity == 'EUR']
a = tdf[tdf.family_id.isnull()]
b = tdf.dropna(subset=['family_id'])
b = b.drop_duplicates(subset=['family_id'])
sf = pd.concat([a, b])
print('Number to use for outliers: {}'.format(tdf.shape[0]))
In [6]:
rf = rna_meta[rna_meta.subject_id.apply(lambda x: x in sf.index)]
rf = rf.sort(columns='in_eqtl', ascending=False).drop_duplicates(subset='subject_id')
In [7]:
tpm_f = tpm[rf.index]
tpm_f = tpm_f[(tpm_f == 0).sum(axis=1) <= 4]
tpm_f = np.log10(tpm_f + 1)
In [8]:
sns.jointplot(tpm_f.mean(axis=1), tpm_f.std(axis=1), kind='hex');
In [9]:
tpm_fn = (tpm_f.T - tpm_f.mean(axis=1)).T
tpm_fn = (tpm_fn.T / tpm_fn.std(axis=1)).T
tpm_fn = tpm_fn[gene_info.ix[tpm_fn.index, 'gene_type'] == 'protein_coding']
tpm_fn = tpm_fn[gene_info.ix[tpm_fn.index, 'chrom'].apply(lambda x: x not in ['chrX', 'chrY', 'chrM'])]
In [10]:
tpm_fn.abs().max(axis=1).hist()
plt.xlabel('Max $|z|$-score')
plt.ylabel('Number of genes');
In [11]:
m = tpm_fn.abs().max(axis=1)
In [12]:
m.sort(ascending=False)
In [13]:
gene_info.ix[m.head(10).index]
Out[13]:
In [19]:
sns.clustermap(tpm_fn.ix[m.head(20).index], xticklabels=False,
yticklabels=gene_info.ix[m.head(20).index, 'gene_name']);
In [ ]: